/*==============================================================================
FILE NAME: Create_TitleV_Panel.do
INPUTS: TitleV_Permits_Clean.dta, Investigations_Clean.dta, investigations_with_noe_nov.dta, incidents_clean.dta, Notice_of_Violation_Clean.dta,
Enforcements_Clean.dta, county complaints.dta
OUTPUTS: TitleV_panel.dta
CREATED: 27 September 2023
UPDATED: 2 November 2023
==============================================================================*/

clear all

/* Set directory if working independently through code
if c(username)=="" { //insert username
	global rootdir "" // insert root path
	global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 
*/

//start putting together RN-by-month panel for Title V permit holders
//1. set up scaffolding for RN-by-month panel for RNs with TitleV permit; 204 months between Jan2003 & Dec2019
//create a file that includes the RNs of facilities that have a Title V permits
use "$processed_data/TitleV_Permits_Clean.dta", clear
//for now leave in all values of Permit_Status_T5 but we could consider dropping those with values of VOID and EXPIRED 
//drop if Permit_Status_T5=="VOID" | Permit_Status_T5=="EXPIRED"
keep RN TitleV
duplicates drop
sort RN
unique RN
//2590 unique RNs show up in TitleV permits data
//create RN-by-month panel structure for these RNs
sort RN
gen year=2003
gen month=1
gen mdate=ym(year,month)
format mdate %tm
drop year month TitleV
//need to create numeric identifier for RN so we can xtset the data
//drop "RN" and destring
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
isid RN_id
//build scaffolding
xtset RN_id mdate
tsappend, add(203)
drop RN
sort RN_id
sort RN_id mdate
save "$processed_data/TitleV_scaffold.dta", replace

//2. merge in investigations
//separate by complaint/noncomplaint investigation first
use "$processed_data/Investigations_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
destring IN, replace
sort IN
save "$processed_data/inv_temp.dta", replace

use "$processed_data/investigations_with_noe_nov.dta", clear
drop year
sort IN
merge 1:m IN using "$processed_data/inv_temp.dta"
keep if _merge==3
drop _merge
gen mdate=ym(year,month)
//for now don't distinguish by media, we can do this later if we'd like
keep IN complaint_inv investigation RN_id mdate day
duplicates drop
//at this point IN uniquely identifies obs
sort RN_id mdate day
//next step will give us # of investigations by RN-day
collapse (sum) investigation complaint_inv, by(RN_id mdate day)
//now create days investigated
sort RN_id mdate
gen day_inv=0
replace day_inv=1 if investigation>0
gen day_complaint_inv=0
replace day_complaint_inv=1 if complaint_inv>0
gen day_nocomplaint_inv=0
replace day_nocomplaint_inv=1 if investigation>complaint_inv
keep RN_id mdate day_inv day_complaint_inv day_nocomplaint_inv
sort RN_id mdate 
collapse (sum) day_inv day_complaint_inv day_nocomplaint_inv, by(RN_id mdate)
label var day_inv "# days RN had >=1 investigation in month"
label var day_complaint_inv "# days RN had >=1 complaint investigation in month"
label var day_nocomplaint_inv "# days RN had >=1 non-complaint investigation in month"
sort RN_id mdate
//restrict to 2003 to 2019
format mdate %tm
keep if mdate>=tm(2003m1) & mdate<=tm(2019m12)
merge 1:1 RN_id mdate using "$processed_data/TitleV_scaffold.dta"
drop if _merge==1
drop _merge
foreach x of varlist day_inv-day_nocomplaint_inv{
replace `x'=0 if `x'==.
}
sort RN_id mdate
gen p_inv=(day_inv>0)
label var p_inv "=1 if RN had at least one investigation in month"
gen p_complaint_inv=(day_complaint_inv>0)
label var p_complaint_inv "=1 if RN had at least one complaint-related investigation in month"
gen p_nocomplaint_inv=(day_nocomplaint_inv>0)
label var p_nocomplaint_inv "=1 if RN had at least one non-complaint-related investigation in month"
save "$processed_data/TitleV_panel_inv.dta", replace

//let's see if RNs that were never investigated have void Title V permits
use "$processed_data/TitleV_panel_inv.dta", clear
rename RN_id RN
tostring RN, replace
replace RN="RN"+RN
collapse (sum) day_inv, by(RN)
unique RN if day_inv==0
//there are 188 RNs that show up in Title V file but don't show up in investigations file
keep if day_inv==0
keep RN
merge 1:m RN using "$processed_data/TitleV_Permits_Clean.dta"
keep if _merge==3
drop _merge
tab Permit_Status_T5
//no strong pattern

//3. merge in complaints
//first measure complaint days like for investigations
use "$processed_data/incidents_clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
drop if year==.
keep CIN RN_id day month year incident
duplicates drop
//at this point CIN uniquely identifies observations
sort RN_id year month day
by RN_id year month day: gen nobs=_N
tab nobs
drop nobs
gen mdate=ym(year,month)
format mdate %tm
sort RN_id mdate day
//this will give us # of complaints by RN-day
collapse (sum) incident, by(RN_id mdate day)
//now create complaint days
sort RN_id mdate
gen day_incident=0
replace day_incident=1 if incident>0
keep RN_id mdate day_incident
sort RN_id mdate 
collapse (sum) day_incident, by(RN_id mdate)
label var day_incident "# days RN had >=1 incidents in month"
//restrict to 2003 to 2019
keep if mdate>=tm(2003m1) & mdate<=tm(2019m12)
sort RN_id mdate
merge 1:1 RN_id mdate using "$processed_data/TitleV_panel_inv.dta"
unique RN_id if _merge==3
//712 unique RNs with Title V permits show up in complaint data
drop if _merge==1
drop _merge
replace day_incident=0 if day_incident==.
sort RN_id mdate
gen p_incident=(day_incident>0)
label var p_incident "=1 if RN had at least one complaint incident in month"
save "$processed_data/TitleV_panel_inv_inc.dta", replace
//now create count of complaints in month
use "$processed_data/incidents_clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
drop if year==.
keep CIN RN_id month year incident
duplicates drop
//at this point CIN uniquely identifies observations
gen mdate=ym(year,month)
format mdate %tm
sort RN_id mdate
//this will give us # of complaints in the month
collapse (sum) incident, by(RN_id mdate)
label var incident "# of complaint incidents against RN in month"
//restrict to 2003 to 2019
keep if mdate>=tm(2003m1) & mdate<=tm(2019m12)
sort RN_id mdate
merge 1:1 RN_id mdate using "$processed_data/TitleV_panel_inv_inc.dta"
unique RN_id if _merge==3
//712 unique RNs with Title V permits show up in complaint data
drop if _merge==1
drop _merge
replace incident=0 if incident==.
sort RN_id mdate
save "$processed_data/TitleV_panel_inv_inc.dta", replace

//4. merge in novs
use "$processed_data/Notice_of_Violation_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
//day month year variables in this file reflect timing of investigation not nov
drop day month year
gen day = day(VN_Date)
gen month = month(VN_Date)
gen year = year(VN_Date)
keep RN_id VN day month year
gen mdate=ym(year,month)
format mdate %tm
duplicates drop
//the same RN VN appears on different dates
//create a count of unique VNs for each RN-month
keep RN_id VN mdate
duplicates drop
sort RN_id mdate
collapse (count) VN, by(RN_id mdate)
rename VN novs
label var novs "number of NOVs received by RN in month"
sort RN_id mdate
merge 1:1 RN_id mdate using "$processed_data/TitleV_panel_inv_inc.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace novs=0 if novs==.
sort RN_id mdate
gen p_nov=(novs>0)
label var p_nov "=1 if RN had at least one NOV in month"
save "$processed_data/TitleV_panel_inv_inc_nov.dta", replace

//5. merge in noes
use "$processed_data/Enforcements_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
keep RN_id VN day month year
gen mdate=ym(year,month)
format mdate %tm
duplicates drop
//the same RN VN appears on different dates
//create a count of unique VNs for each RN-month
keep RN_id VN mdate
duplicates drop
sort RN_id mdate
collapse (count) VN, by(RN_id mdate)
rename VN noes
label var noes "number of NOEs received by RN in month"
sort RN_id mdate
merge 1:1 RN_id mdate using "$processed_data/TitleV_panel_inv_inc_nov.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace noes=0 if noes==.
sort RN_id mdate
gen date=dofm(mdate)
gen month=month(date)
gen year=year(date)
drop date
gen p_noe=(noes>0)
label var p_noe "=1 if RN had at least one NOE in month"
save "$processed_data/TitleV_panel.dta", replace

//6. merge county back in so that counts of total complaints by county-month can be added
use "$processed_data/TitleV_Permits_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
keep RN_id County_T5
rename County_T5 county
duplicates drop
sort RN_id
by RN_id: gen nobs=_N
by RN_id: gen num=_n
tab nobs
//16 RNs are associated with more than one county. suspect this is because of expired permit but for now keep first county and flag
gen county_flag=1 if nobs>1
label var county_flag "=1 if RN is associated with more than one county in Title V data"
drop if nobs>1 & num>1
drop nobs num
sort RN_id
save "$processed_data/temp.dta", replace

use "$processed_data/TitleV_panel.dta"
sort RN_id
merge m:1 RN_id using "$processed_data/temp.dta"
drop _merge
sort county mdate
merge m:1 county mdate using "$processed_data/county complaints.dta"
drop if _merge==2
replace county_complaint=0 if _merge==1
drop _merge
sort RN_id mdate
gen other_complaint=county_complaint-incident
label var other_complaint "# of complaints against other RNs in the county-month"
save "$processed_data/TitleV_panel.dta", replace
//note that it's possible for an RN to have a different county in the Title V data than in the complaints data
//for example RN102420684 is in Glassock in complaints data but in Howard in Title V data

erase "$processed_data/temp.dta"
erase "$processed_data/TitleV_panel_inv_inc.dta"
erase "$processed_data/TitleV_panel_inv_inc_nov.dta"
erase "$processed_data/TitleV_panel_inv.dta"
erase "$processed_data/inv_temp.dta"
erase "$processed_data/TitleV_scaffold.dta"
